Filter criteria:
Filter differentially expressed genes between autism and control (adj. p-value < 0.05 and lfc<log2(1.2))
No samples are removed based on network connectivity z-scores
# # FILTER DE GENES
# load('./../working_data/RNAseq_ASD_4region_normalized_vst.Rdata')
#
# # Balance Groups by covariates, remove singular batches (none)
# to_keep = (datMeta$Subject_ID != 'AN03345') & !is.na(datMeta$Dx)
# table(to_keep)
# datMeta = datMeta[to_keep,]
# datExpr = datExpr[,to_keep]
#
# # Select genes differentially expressed in ASD
# mod = model.matrix(~ Dx, data=datMeta)
# corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
# lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
#
# fit = eBayes(lmfit, trend=T, robust=T)
# top_genes = topTable(fit, coef=2, number=nrow(datExpr))
# ordered_top_genes = top_genes[match(rownames(datExpr), rownames(top_genes)),]
#
# ASD_pvals = rownames(ordered_top_genes)[ordered_top_genes$adj.P.Val<0.05] # 6% of genes (3380)
# ASD_lfc = rownames(ordered_top_genes)[abs(ordered_top_genes$logFC)>log2(1.2)] # 13% of genes (6732)
# ASD_pvals_lfc = intersect(ASD_pvals, ASD_lfc) # 6% of genes (2928)
#
# datExpr = datExpr[match(ASD_pvals_lfc, rownames(datExpr)),]
#
# datProbes = datProbes[rownames(datProbes) %in% rownames(datExpr),]
#
# rm(to_keep, corfit, fit, lmfit, mod, ASD_pvals, ASD_lfc, ASD_pvals_lfc, top_genes)
#
# save(file='./../working_data/RNAseq_ASD_4region_DEgenes_vst_adj_pval_lfc.Rdata', datExpr, datMeta, datProbes, ordered_top_genes)
load('./../working_data/RNAseq_ASD_4region_DEgenes_vst_adj_pval_lfc.Rdata')
DE_info = ordered_top_genes
rm(ordered_top_genes)
glue('Number of genes: ', nrow(datExpr), '\n',
'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 3071
## Number of samples: 86 (51 ASD, 35 controls)
First principal component explains 93% of the total variance
reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98){
datExpr = data.frame(datExpr)
datExpr_pca = prcomp(datExpr, scale.=TRUE)
last_pc = data.frame(summary(datExpr_pca)$importance[3,]) %>% rownames_to_column(var='ID') %>%
filter(.[[2]] >= var_explained) %>% top_n(-1) %>% dplyr::select(ID)
par(mfrow=c(1,2))
plot(summary(datExpr_pca)$importance[2,], type='b')
abline(v=substr(last_pc$ID, 3, nchar(last_pc$ID)), col='blue')
plot(summary(datExpr_pca)$importance[3,], type='b')
abline(h=var_explained, col='blue')
print(glue('Keeping top ', substr(last_pc$ID, 3, nchar(last_pc$ID)), ' components that explain ',
var_explained*100, '% of the variance'))
datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:last_pc$ID)
return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}
reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)
## Keeping top 6 components that explain 98% of the variance
datExpr_redDim = reduce_dim_output$datExpr
pca_output = reduce_dim_output$pca_output
rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr)
Genes are still separated into two clouds of points:
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2)) + geom_point() + theme_minimal()
Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant. Colouring by their log2 fold change we can see that the genes from the cloud on the top are overexpressed and the genes in the bottom one underexpressed.
datMeta_backup = datMeta # So datMeta doesn't get replaced with unfiltered version
load('./../working_data/RNAseq_ASD_4region_normalized_vst.Rdata')
datMeta = datMeta_backup
ASD_pvals = DE_info$adj.P.Val
log_fold_change = DE_info$logFC
pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame
p1 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=ASD_pvals)) + geom_point(alpha=0.5) +
theme_minimal() + theme(legend.position='bottom')
p2 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=log_fold_change)) + geom_point() +
theme_minimal() + theme(legend.position='bottom') + scale_colour_gradient2()
grid.arrange(p1, p2, ncol=2)
rm(ASD_pvals, log_fold_change, top_genes, datExpr, datProbes, datMeta_backup, p1, p2)
Only 159 of our 3071 genes appear on the SFARI list, losing all 23/25 genes with a score of 1
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv',
col_types = cols(`number-of-reports` = col_integer(), syndromic = col_logical()))
Gene Score count considering all genes:
SFARI_genes$`gene-score` %>% table
## .
## 1 2 3 4 5 6
## 25 62 195 454 175 25
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>% mutate('gene-symbol'=hgnc_symbol) %>%
dplyr::select('ensembl_gene_id', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
gene_scores = SFARI_genes %>% dplyr::select(ensembl_gene_id, `gene-score`, syndromic) %>%
right_join(gene_names, by='ensembl_gene_id') %>% dplyr::select(-'gene-symbol')
# Full (ordered) list of datExpr genes with their scores
gene_scores = data.frame('ensembl_gene_id'=rownames(datExpr_redDim)) %>%
left_join(gene_scores, by = 'ensembl_gene_id') %>%
mutate('syndromic' = ifelse(syndromic==FALSE | is.na(syndromic), FALSE, TRUE),
'gene-bool' = ifelse(is.na(`gene-score`), FALSE, TRUE))
rm(SFARI_genes, mart, gene_names)
Gene score count for genes in datExpr
gene_scores$`gene-score` %>% table
## .
## 1 2 3 4 5 6
## 2 5 29 74 44 5
clusterings = list()
clusterings[['SFARI_score']] = gene_scores$`gene-score`
names(clusterings[['SFARI_score']]) = gene_scores$ensembl_gene_id
clusterings[['SFARI_bool']] = gene_scores$`gene-bool`
names(clusterings[['SFARI_bool']]) = gene_scores$ensembl_gene_id
clusterings[['syndromic']] = gene_scores$syndromic
names(clusterings[['syndromic']]) = gene_scores$ensembl_gene_id
set.seed(123)
wss = sapply(1:10, function(k) kmeans(datExpr_redDim, k, iter.max=100, nstart=25,
algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
best_k = 3
abline(v = best_k, col='blue')
datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster
Chose k=7 as best number of clusters. SFARI genes seem to group in the last two clusters
h_clusts = datExpr_redDim %>% dist %>% hclust
# plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)
best_k = 7
clusterings[['hc']] = cutree(h_clusts, best_k)
create_viridis_dict = function(){
min_score = clusterings[['SFARI_score']] %>% min(na.rm=TRUE)
max_score = clusterings[['SFARI_score']] %>% max(na.rm=TRUE)
viridis_score_cols = viridis(max_score - min_score + 1)
names(viridis_score_cols) = seq(min_score, max_score)
return(viridis_score_cols)
}
viridis_score_cols = create_viridis_dict()
dend_meta = gene_scores[match(labels(h_clusts), gene_scores$ensembl_gene_id),] %>%
mutate('SFARI_score' = viridis_score_cols[`gene-score`], # Purple: 2, Yellow: 6
'SFARI_bool' = ifelse(`gene-bool` == T, '#21908CFF', 'white'), # Acqua
'Syndromic' = ifelse(syndromic == T, 'orange', 'white')) %>%
dplyr::select(SFARI_score, SFARI_bool, Syndromic)
h_clusts %>% as.dendrogram %>% set('labels', rep('', nrow(datMeta))) %>%
set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)
Samples are grouped into two big clusters, two small clusters and two outliers, the first big cluster has one main subcluster, two small subclusters and three outliers, and the second one has one main subcluster, one small one and three groups of outliers.
*Output plots in clustering_genes_04_04 folder
Following this paper’s guidelines:
Run PCA and keep enough components to explain 60% of the variance
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign genes to clusters with FDR<0.01 using the fdrtool package
ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
signals_w_kurtosis = ICA_output$S %>% data.frame %>% dplyr::select(names(apply(ICA_output$S, 2, kurtosis)>3))
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F)$qval<0.01) %>% data.frame
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c
clusterings[['ICA_NA']] = is.na(clusterings[['ICA_min']])
Leaves most of the observations (~92%) without a cluster (12% more than cqn normalisation):
ICA_clusters %>% rowSums %>% table
## .
## 0 1 2
## 2830 232 9
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2, Var1)) +
geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
The soft R-squared value starts in 0.7 but then becomes tiny and needs a power of over 150 to recover. Taking power=1
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(1, 10, by=1))
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.7000 2.250 0.696 2360 2590 2670
## 2 2 0.4420 0.983 0.729 2040 2330 2460
## 3 3 0.2710 0.616 0.805 1830 2140 2320
## 4 4 0.1870 0.446 0.826 1670 2000 2220
## 5 5 0.1190 0.335 0.842 1550 1870 2130
## 6 6 0.0929 0.276 0.860 1460 1770 2050
## 7 7 0.0635 0.218 0.880 1370 1670 1990
## 8 8 0.0425 0.172 0.885 1300 1590 1930
## 9 9 0.0300 0.136 0.886 1240 1510 1880
## 10 10 0.0187 0.103 0.893 1190 1450 1830
network = datExpr_redDim %>% t %>% blockwiseModules(power=1, numericLabels=TRUE)
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)
It only leaves 68 genes without a cluster but classifies almost all points into the same class:
clusterings[['WGCNA']] %>% table
## .
## 0 1 2
## 68 2921 82
Number of clusters that resemble more Gaussian mixtures = 39
n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')
best_k = 27
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)
Plot of clusters with their centroids in gray
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
Trying with 8 clusters
best_k = 8
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_2']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_2']] %>% table
## .
## 1 2 3 4 5 6 7 8
## 292 520 455 581 265 286 330 342
Separate the two clouds of points by a straight line. There seems to be a difference in the mean expression of the genes between clusters but not in their standard deviation.
# Modify equation sign if needed so blue cluter represents overexpressed genes
manual_clusters = as.factor(as.numeric(-0.1*datExpr_redDim$PC1 + 0.05 > datExpr_redDim$PC2))
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2, color=manual_clusters)) + geom_point() +
geom_abline(slope=-0.1, intercept=0.05, color='gray') + theme_minimal()
names(manual_clusters) = rownames(datExpr_redDim)
clusterings[['Manual']] = manual_clusters
clusterings[['Manual']] %>% table
## .
## 0 1
## 1468 1603
Both clusters could be GM with 2 gaussians each in the means and 3 in the sd.
manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd),
manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
p1 = manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
p2 = manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)
Separate clusters into two Gaussians per diagnosis by their mean:
gg_colour_hue = function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
n_clusters = 2
c1_mean = manual_clusters_data %>% filter(cluster==1) %>% dplyr::select(mean)
rownames(c1_mean) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='1']
gmm_c1_mean = c1_mean %>% GMM(n_clusters)
c2_mean = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(mean)
rownames(c2_mean) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_mean = c2_mean %>% GMM(n_clusters)
clusterings[['Manual_mean']] = as.character(clusterings[['Manual']])
clusterings[['Manual_mean']][clusterings[['Manual']]==0] = gmm_c1_mean$Log_likelihood %>%
apply(1, function(x) glue('1_',which.max(x)))
clusterings[['Manual_mean']][clusterings[['Manual']]==1] = gmm_c2_mean$Log_likelihood %>%
apply(1, function(x) glue('2_',which.max(x)))
plot_gaussians = manual_clusters_data %>% ggplot(aes(x=mean)) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[1], # red
args=list(mean=gmm_c1_mean$centroids[1], sd=gmm_c1_mean$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[2], # green
args=list(mean=gmm_c1_mean$centroids[2], sd=gmm_c1_mean$covariance_matrices[2])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[3], # blue
args=list(mean=gmm_c2_mean$centroids[1], sd=gmm_c2_mean$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[4], # purple
args=list(mean=gmm_c2_mean$centroids[2], sd=gmm_c2_mean$covariance_matrices[2])) +
theme_minimal()
plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_mean']]))) +
geom_point() + theme_minimal()
grid.arrange(plot_gaussians, plot_points, ncol=2)
clusterings[['Manual_sd']] %>% table
## < table of extent 0 >
Separate clusters into three Gaussians per diagnosis by their sd:
n_clusters = 3
c1_sd = manual_clusters_data %>% filter(cluster==1) %>% dplyr::select(sd)
rownames(c1_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='1']
gmm_c1_sd = c1_sd %>% GMM(n_clusters)
c2_sd = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(sd)
rownames(c2_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_sd = c2_sd %>% GMM(n_clusters)
clusterings[['Manual_sd']] = as.character(clusterings[['Manual']])
clusterings[['Manual_sd']][clusterings[['Manual']]==0] = gmm_c1_sd$Log_likelihood %>%
apply(1, function(x) glue('1_',which.max(x)))
clusterings[['Manual_sd']][clusterings[['Manual']]==1] = gmm_c2_sd$Log_likelihood %>%
apply(1, function(x) glue('2_',which.max(x)))
plot_gaussians = manual_clusters_data %>% ggplot(aes(x=sd)) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[1], #
args=list(mean=gmm_c1_sd$centroids[1], sd=gmm_c1_sd$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[2], #
args=list(mean=gmm_c1_sd$centroids[2], sd=gmm_c1_sd$covariance_matrices[2])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[3], #
args=list(mean=gmm_c1_sd$centroids[3], sd=gmm_c1_sd$covariance_matrices[3])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[4], #
args=list(mean=gmm_c2_sd$centroids[1], sd=gmm_c2_sd$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[5], #
args=list(mean=gmm_c2_sd$centroids[2], sd=gmm_c2_sd$covariance_matrices[2])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[6], #
args=list(mean=gmm_c2_sd$centroids[3], sd=gmm_c2_sd$covariance_matrices[3])) +
theme_minimal()
plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_sd']]))) +
geom_point() + theme_minimal()
grid.arrange(plot_gaussians, plot_points, ncol=2)
clusterings[['Manual_sd']] %>% table
## .
## 1_1 1_2 1_3 2_1 2_2 2_3
## 580 363 525 535 621 447
rm(c1_sd, c2_sd, gmm_c1_sd, gmm_c2_sd)
# Clean up the environment a bit
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output,
ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network,
best_power, c, manual_clusters, manual_clusters_data, c1_sd, c2_sd, c1_mean, c2_mean,
gmm_c1_sd, gmm_c2_sd,gmm_c1_sd, gmm_c1_mean, gmm_c2_mean, p1, p2, pca_data_projection, dend_meta,
plot_gaussians, plot_points, n_clusters, viridis_score_cols, gg_colour_hue, create_viridis_dict)
Using Adjusted Rand Index:
Clusterings seem to give very different results and none resembles the manual separation much
Manual separation + mean and + sd give similar clusterings to K-means, Hierarchical clustering, consensus clustering, ICA and Gaussian mixtures with a smaller number of Gaussians
Simple methods give similar results (K-means, hierarchical clustering, consensus clustering)
cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
cluster1 = as.factor(clusterings[[i]])
for(j in (i):length(clusterings)){
cluster2 = as.factor(clusterings[[j]])
cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
}
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
cexRow = 1, cexCol = 1, margins = c(7,7))
rm(i, j, cluster1, cluster2, cluster_sim)
Simple clusterings consider only the 1st component
ICA seems to make a bit more sense than before, but it still classifies to little points
WGCNA doesn’t work well (classifies almost everything as a single class)
SFARI genes seem to be everywhere (perhaps a bit more concentrated in the top right?)
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), k_means = as.factor(clusterings[['km']]),
hc = as.factor(clusterings[['hc']]), cc_l1 = as.factor(clusterings[['cc_l1']]),
cc_l2 = as.factor(clusterings[['cc_l2']]), ica = as.factor(clusterings[['ICA_min']]),
n_ica = as.factor(rowSums(ICA_clusters)), gmm = as.factor(clusterings[['GMM']]),
gmm_2 = as.factor(clusterings[['GMM_2']]), wgcna = as.factor(clusterings[['WGCNA']]),
manual = as.factor(clusterings[['Manual']]), manual_mean = as.factor(clusterings[['Manual_mean']]),
manual_sd = as.factor(clusterings[['Manual_sd']]), SFARI = as.factor(clusterings[['SFARI_score']]),
SFARI_bool = as.factor(clusterings[['SFARI_bool']]), syndromic = as.factor(clusterings[['syndromic']])) %>%
bind_cols(DE_info[rownames(DE_info) %in% rownames(datExpr_redDim),])
selectable_scatter_plot(plot_points, plot_points)